Discovering contextual* links by counting top inlinked URLs
*contextual: links within content, and not part of navigation, header, footer, etc
Code and data: https://github.com/eliasdabbas/crawlytics_demo
Code
import advertools as adv
import adviz
import pandas as pd
import plotly.express as px
import networkx as nx
from IPython.display import display_markdown
pd.options.display.max_columns = None
from dash_bootstrap_templates import load_figure_template
load_figure_template('all')
def md(text):
    """Render *text* as Markdown in the notebook output."""
    return display_markdown(f'{text}', raw=True)


# Crawl the site starting from the home page, following the links found on
# each page. `custom_settings` entries are handed to the underlying crawler.
# NOTE(review): the page-count cap is 1000 while the resulting table is
# reported with 9,942 rows, presumably because the job (JOBDIR) was resumed
# several times - confirm.
adv.crawl(
    url_list='https://www.nasa.gov/',
    output_file='nasa_crawl.jl',
    follow_links=True,
    custom_settings={
        'CLOSESPIDER_PAGECOUNT': 1000,
        'LOG_FILE': 'nasa_crawl.log',
        'JOBDIR': 'nasa_crawl',
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_TARGET_CONCURRENCY': 6,
    },
)
# convert from .jl to .parquet:
adv.crawlytics.jl_to_parquet('nasa_crawl.jl', 'nasa_crawl.parquet')
Code
crawldf = pd.read_parquet('nasa_crawl.parquet')
crawldf.head(3)| url | title | meta_desc | viewport | charset | h2 | h3 | canonical | alt_href | og:locale | og:type | og:title | og:description | og:url | og:site_name | og:updated_time | og:image | og:image:secure_url | og:image:width | og:image:height | og:image:alt | og:image:type | og:video | og:video_1 | og:video_2 | og:video_3 | og:video_4 | og:video_5 | og:video_6 | og:video_7 | og:video_8 | og:video_9 | og:video_10 | og:video_11 | og:video_12 | og:video_13 | og:video_14 | og:video_15 | og:video_16 | og:video_17 | og:video_18 | og:video_19 | og:video_20 | og:video_21 | og:video_22 | og:video_23 | twitter:card | twitter:title | twitter:description | twitter:image | jsonld_@context | jsonld_@graph | body_text | size | download_timeout | download_slot | download_latency | depth | status | links_url | links_text | links_nofollow | nav_links_url | nav_links_text | nav_links_nofollow | header_links_url | header_links_text | header_links_nofollow | footer_links_url | footer_links_text | footer_links_nofollow | img_fetchpriority | img_width | img_height | img_alt | img_src | img_srcset | img_decoding | img_sizes | img_loading | ip_address | crawl_time | resp_headers_Content-Length | resp_headers_Server | resp_headers_Date | resp_headers_Content-Type | resp_headers_Host-Header | resp_headers_X-Launch-Status | resp_headers_Link | resp_headers_X-Rq | resp_headers_Cache-Control | resp_headers_Age | resp_headers_X-Cache | resp_headers_Vary | resp_headers_Accept-Ranges | resp_headers_Strict-Transport-Security | request_headers_Accept | request_headers_Accept-Language | request_headers_User-Agent | request_headers_Accept-Encoding | h1 | h4 | request_headers_Referer | twitter:label1 | twitter:data1 | twitter:label2 | twitter:data2 | redirect_times | redirect_ttl | redirect_urls | redirect_reasons | resp_headers_Last-Modified | resp_headers_Etag | resp_headers_Access-Control-Allow-Origin | resp_headers_Access-Control-Allow-Methods | resp_headers_Content-Disposition | 
twitter:image:alt | twitter:image:width | twitter:image:height | resp_headers_X-Powered-By | h5 | jsonld_@type | jsonld_headline | jsonld_url | jsonld_thumbnailUrl | jsonld_articleSection | jsonld_author | jsonld_creator | jsonld_keywords | jsonld_dateCreated | jsonld_datePublished | jsonld_dateModified | jsonld_mainEntityOfPage.@type | jsonld_mainEntityOfPage.@id | jsonld_image.@type | jsonld_image.url | jsonld_publisher.@type | jsonld_publisher.name | jsonld_publisher.logo | resp_headers_X-Hacker | img_align | resp_headers_X-Content-Type-Options | resp_headers_X-Xss-Protection | resp_headers_Content-Security-Policy | resp_headers_Set-Cookie | resp_headers_Expires | resp_headers_X-Frame-Options | resp_headers_Server-Timing | resp_headers_Timing-Allow-Origin | request_headers_Cookie | resp_headers_X-Amz-Id-2 | resp_headers_X-Amz-Request-Id | resp_headers_X-Amz-Replication-Status | resp_headers_X-Amz-Server-Side-Encryption | resp_headers_X-Amz-Meta-Cb-Modifiedtime | resp_headers_X-Amz-Version-Id | resp_headers_X-Ua-Compatible | resp_headers_Referrer-Policy | resp_headers_Via | resp_headers_X-Amz-Cf-Pop | resp_headers_X-Amz-Cf-Id | h6 | img_border | img_hspace | img_vspace | resp_headers_Content-Language | resp_headers_Cf-Cache-Status | resp_headers_Cf-Ray | img_ismap | img_usemap | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | https://www.nasa.gov/ | NASA | NASA.gov brings you the latest news, images and videos from America's space agency, pioneering the future in space e... | width=device-width, initial-scale=1 | UTF-8 | Suggested Searches@@Martians Wanted@@Featured News@@NASA’s SpaceX Crew-8@@Image Of The Day@@Image Of The Day@@Our Ch... | News & Events@@Multimedia@@Featured@@Teams Add Iconic NASA ‘Worm’ Logo to Artemis II Rocket, Spacecraft@@Flame Burns... | https://www.nasa.gov/ | https://www.nasa.gov/feed/@@https://www.nasa.gov/wp-json/wp/v2/pages/128943@@https://www.nasa.gov/wp-json/oembed/1.0... | en_US | website | NASA | NASA.gov brings you the latest news, images and videos from America's space agency, pioneering the future in space e... | https://www.nasa.gov/ | NASA | 2024-02-17T01:14:00-05:00 | https://www.nasa.gov/wp-content/uploads/2018/07/174116main_2006_01777_highres.jpg | https://www.nasa.gov/wp-content/uploads/2018/07/174116main_2006_01777_highres.jpg | 640 | 512 | NASA Meatball paint refresh | image/jpeg | https://www.youtube.com/embed/21X5lGlDOfg | https://www.youtube.com/embed/NpHFB_DYXhY | https://www.youtube.com/embed/_LJHRpDvPCw | https://www.youtube.com/embed/bTQjiMtpMG0 | https://www.nasa.gov/wp-content/uploads/2023/11/final-nasa-15-sec-horizontal-16-9.mp4 | https://www.youtube.com/embed/1fOWosS_f1Y | https://www.youtube.com/embed/31b1yjUBlO0 | https://www.youtube.com/embed/MTyzq4ey9RE | https://www.youtube.com/embed/OffTxAiAQfM | https://www.youtube.com/embed/ZbBx4sW68uw | https://www.youtube.com/embed/vUYcQ_ehArw | https://www.youtube.com/embed/YQWespzOtzI | https://www.youtube.com/embed/CRZYw9fEBe4 | https://www.youtube.com/embed/IGuHErKAiHs | https://www.youtube.com/embed/R-TOoGTvFL8 | https://www.youtube.com/embed/p566jU9pylY | https://www.youtube.com/embed/_tdsia6EZY8 | https://www.youtube.com/embed/hW5akI5Rnyg | https://www.youtube.com/embed/WQR_iNjEjlw | https://www.youtube.com/embed/iDAKTLmt2hs | https://www.youtube.com/embed/VwVL0UBVVLA | 
https://www.youtube.com/embed/Ha4mXufQp6c | https://www.youtube.com/embed/sgp_2OBxKeM | https://www.youtube.com/embed/UyXS2tYggiE | summary_large_image | NASA | NASA.gov brings you the latest news, images and videos from America's space agency, pioneering the future in space e... | https://www.nasa.gov/wp-content/uploads/2018/07/174116main_2006_01777_highres.jpg | https://schema.org | [{'@id': 'https://www.nasa.gov/#organization', '@type': 'Organization', 'about': None, 'articleSection': None, 'auth... | \n\t\t\t\t\t \n\t\t\t\t\t\t Explore \n\t\t\t\t\t\t \n\t\t\t\t\t \n\t\t\t\t \n\t\t\t\t\t \n\t\t\t\t\t\t N... | 299218 | 180 | www.nasa.gov | 0.064869 | 0 | 200 | https://www.nasa.gov/@@https://www.nasa.gov/@@https://www.nasa.gov/news/@@https://www.nasa.gov/news/all-news/@@https... | \n\t\t\t\t\n\t\t\t@@\n\t\t\t\t\n\t\t\t@@\n\t\t\t\t\t\t\t\tNews & Events\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t@@\n\t\t\t\... | False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@Fals... | https://www.nasa.gov/news/@@https://www.nasa.gov/news/all-news/@@https://plus.nasa.gov/series/@@https://www.nasa.gov... | \n\t\t\t\t\t\t\t\tNews & Events\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t@@\n\t\t\t\t\t\t\t\t\t\tAll NASA News\n\t\t\t\t\t\t... | False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@Fals... | https://www.nasa.gov/@@https://www.nasa.gov/@@https://www.nasa.gov/news/@@https://www.nasa.gov/news/all-news/@@https... | \n\t\t\t\t\n\t\t\t@@\n\t\t\t\t\n\t\t\t@@\n\t\t\t\t\t\t\t\tNews & Events\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t@@\n\t\t\t\... | False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@Fals... | https://www.nasa.gov/about/@@https://www.nasa.gov/get-involved/@@https://www.nasa.gov/@@https://www.nasa.gov/news/@@... 
| About NASA's Mission@@\n\t\t\t\t\t\t\tJoin Us\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t@@Home@@News & Events@@Multimedia@@NASA+@... | False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@Fals... | high@@high@@high@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@high@@@@@@@@@@@@@@@@@@@@@@@@... | 60@@60@@640@@640@@640@@@@640@@640@@640@@640@@640@@640@@@@@@@@@@@@@@@@640@@@@@@@@1920@@1920@@640@@640@@@@@@640@@640@@... | 50.58@@50.58@@960@@519@@360@@@@427@@960@@427@@960@@519@@427@@@@@@@@@@@@@@@@360@@@@@@@@1280@@1627@@360@@640@@@@@@481@... | NASA Logo@@NASA Logo@@@@A sample of fabric burns inside Spacecraft Fire Experiment-IV (Saffire-IV). The sample is a ... | https://www.nasa.gov/wp-content/themes/nasa/assets/images/nasa-logo@2x.png@@https://www.nasa.gov/wp-content/themes/n... | https://www.nasa.gov/wp-content/themes/nasa/assets/images/nasa-logo.svg@@https://www.nasa.gov/wp-content/themes/nasa... | @@@@async@@async@@async@@@@async@@async@@async@@async@@async@@async@@@@@@@@@@@@@@@@async@@@@@@@@async@@async@@async@... | @@@@(max-width: 640px) 100vw, 640px@@(max-width: 640px) 100vw, 640px@@(max-width: 640px) 100vw, 640px@@@@(max-width:... | @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@eager@@lazy@@@@@@@@lazy@@lazy@@lazy@@laz... | 192.0.66.108 | 2024-02-19 08:46:43 | 31913.0 | nginx | Mon, 19 Feb 2024 08:46:43 GMT | text/html; charset=UTF-8 | a9130478a60e5f9135f765b23f26593b | Go Flight! | <https://www.nasa.gov/wp-json/>; rel="https://api.w.org/",<https://www.nasa.gov/wp-json/wp/v2/pages/128943>; rel="al... 
| hhn1 85 187 443 | max-age=300, must-revalidate | 666.0 | hit | Accept-Encoding | bytes | max-age=31536000 | text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8 | en | advertools/0.14.0 | gzip, deflate, br | None | None | None | None | None | None | None | NaN | NaN | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 1 | https://www.nasa.gov/?search=SpaceX%20Crew-2 | 3522 Search Results for "SpaceX Crew-2" | None | width=device-width, initial-scale=1 | UTF-8 | Suggested Searches@@\n\t\t\t\t3522 results found\t\t\t\t\t | News & Events@@Multimedia@@Featured@@Teams Add Iconic NASA ‘Worm’ Logo to Artemis II Rocket, Spacecraft@@Flame Burns... | None | https://www.nasa.gov/feed/ | en_US | website | None | None | None | NASA | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | summary_large_image | None | None | None | https://schema.org | [{'@id': 'https://www.nasa.gov/#organization', '@type': 'Organization', 'about': None, 'articleSection': None, 'auth... | \n\t\t\t\t\t \n\t\t\t\t\t\t Explore \n\t\t\t\t\t\t \n\t\t\t\t\t \n\t\t\t\t \n\t\t\t\t\t \n\t\t\t\t\t\t N... | 234115 | 180 | www.nasa.gov | 1.301717 | 1 | 200 | https://www.nasa.gov/@@https://www.nasa.gov/@@https://www.nasa.gov/news/@@https://www.nasa.gov/news/all-news/@@https... | \n\t\t\t\t\n\t\t\t@@\n\t\t\t\t\n\t\t\t@@\n\t\t\t\t\t\t\t\tNews & Events\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t@@\n\t\t\t\... | False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@Fals... | https://www.nasa.gov/news/@@https://www.nasa.gov/news/all-news/@@https://plus.nasa.gov/series/@@https://www.nasa.gov... | \n\t\t\t\t\t\t\t\tNews & Events\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t@@\n\t\t\t\t\t\t\t\t\t\tAll NASA News\n\t\t\t\t\t\t... | False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@Fals... | https://www.nasa.gov/@@https://www.nasa.gov/@@https://www.nasa.gov/news/@@https://www.nasa.gov/news/all-news/@@https... | \n\t\t\t\t\n\t\t\t@@\n\t\t\t\t\n\t\t\t@@\n\t\t\t\t\t\t\t\tNews & Events\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t@@\n\t\t\t\... 
| False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@Fals... | https://www.nasa.gov/about/@@https://www.nasa.gov/get-involved/@@https://www.nasa.gov/@@https://www.nasa.gov/news/@@... | About NASA's Mission@@\n\t\t\t\t\t\t\tJoin Us\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t@@Home@@News & Events@@Multimedia@@NASA+@... | False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@Fals... | high@@high@@high@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@... | 60@@60@@640@@640@@640@@@@640@@640@@640@@640@@640@@640@@@@@@@@@@@@@@@@640@@@@@@@@1920@@1920@@640@@640@@@@@@640@@640@@... | 50.58@@50.58@@960@@519@@360@@@@427@@960@@427@@960@@519@@427@@@@@@@@@@@@@@@@360@@@@@@@@1280@@1627@@360@@640@@@@@@481@... | NASA Logo@@NASA Logo@@@@A sample of fabric burns inside Spacecraft Fire Experiment-IV (Saffire-IV). The sample is a ... | https://www.nasa.gov/wp-content/themes/nasa/assets/images/nasa-logo@2x.png@@https://www.nasa.gov/wp-content/themes/n... | https://www.nasa.gov/wp-content/themes/nasa/assets/images/nasa-logo.svg@@https://www.nasa.gov/wp-content/themes/nasa... | @@@@async@@async@@async@@@@async@@async@@async@@async@@async@@async@@@@@@@@@@@@@@@@async@@@@@@@@async@@async@@async@... | @@@@(max-width: 640px) 100vw, 640px@@(max-width: 640px) 100vw, 640px@@(max-width: 640px) 100vw, 640px@@@@(max-width:... | None | 192.0.66.108 | 2024-02-19 08:46:47 | NaN | nginx | Mon, 19 Feb 2024 08:46:47 GMT | text/html; charset=UTF-8 | a9130478a60e5f9135f765b23f26593b | Go Flight! 
| <https://www.nasa.gov/wp-json/>; rel="https://api.w.org/" | hhn1 85 187 443 | max-age=300, must-revalidate | 0.0 | miss | Accept-Encoding | bytes | max-age=31536000 | text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8 | en | advertools/0.14.0 | gzip, deflate, br | \n\t\t\t\t\tSearch Results for: SpaceX Crew-2\t\t\t\t | The SpaceX Freedom Dragon crew ship with the Axiom Mission-2 crew - NASA@@NASA's SpaceX Crew-7@@The SpaceX Freedom D... | https://www.nasa.gov/ | None | None | None | None | NaN | NaN | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
| 2 | https://www.nasa.gov/?search=International%20Space%20Station | 28773 Search Results for "International Space Station" | None | width=device-width, initial-scale=1 | UTF-8 | Suggested Searches@@\n\t\t\t\t28773 results found\t\t\t\t\t | News & Events@@Multimedia@@Featured@@Teams Add Iconic NASA ‘Worm’ Logo to Artemis II Rocket, Spacecraft@@Flame Burns... | None | https://www.nasa.gov/feed/ | en_US | website | None | None | None | NASA | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | summary_large_image | None | None | None | https://schema.org | [{'@id': 'https://www.nasa.gov/#organization', '@type': 'Organization', 'about': None, 'articleSection': None, 'auth... | \n\t\t\t\t\t \n\t\t\t\t\t\t Explore \n\t\t\t\t\t\t \n\t\t\t\t\t \n\t\t\t\t \n\t\t\t\t\t \n\t\t\t\t\t\t N... | 233543 | 180 | www.nasa.gov | 2.509646 | 1 | 200 | https://www.nasa.gov/@@https://www.nasa.gov/@@https://www.nasa.gov/news/@@https://www.nasa.gov/news/all-news/@@https... | \n\t\t\t\t\n\t\t\t@@\n\t\t\t\t\n\t\t\t@@\n\t\t\t\t\t\t\t\tNews & Events\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t@@\n\t\t\t\... | False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@Fals... | https://www.nasa.gov/news/@@https://www.nasa.gov/news/all-news/@@https://plus.nasa.gov/series/@@https://www.nasa.gov... | \n\t\t\t\t\t\t\t\tNews & Events\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t@@\n\t\t\t\t\t\t\t\t\t\tAll NASA News\n\t\t\t\t\t\t... | False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@Fals... | https://www.nasa.gov/@@https://www.nasa.gov/@@https://www.nasa.gov/news/@@https://www.nasa.gov/news/all-news/@@https... | \n\t\t\t\t\n\t\t\t@@\n\t\t\t\t\n\t\t\t@@\n\t\t\t\t\t\t\t\tNews & Events\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t@@\n\t\t\t\... 
| False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@Fals... | https://www.nasa.gov/about/@@https://www.nasa.gov/get-involved/@@https://www.nasa.gov/@@https://www.nasa.gov/news/@@... | About NASA's Mission@@\n\t\t\t\t\t\t\tJoin Us\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t@@Home@@News & Events@@Multimedia@@NASA+@... | False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@False@@Fals... | high@@high@@high@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@... | 60@@60@@640@@640@@640@@@@640@@640@@640@@640@@640@@640@@@@@@@@@@@@@@@@640@@@@@@@@1920@@1920@@640@@640@@@@@@640@@640@@... | 50.58@@50.58@@960@@519@@360@@@@427@@960@@427@@960@@519@@427@@@@@@@@@@@@@@@@360@@@@@@@@1280@@1627@@360@@640@@@@@@481@... | NASA Logo@@NASA Logo@@@@A sample of fabric burns inside Spacecraft Fire Experiment-IV (Saffire-IV). The sample is a ... | https://www.nasa.gov/wp-content/themes/nasa/assets/images/nasa-logo@2x.png@@https://www.nasa.gov/wp-content/themes/n... | https://www.nasa.gov/wp-content/themes/nasa/assets/images/nasa-logo.svg@@https://www.nasa.gov/wp-content/themes/nasa... | @@@@async@@async@@async@@@@async@@async@@async@@async@@async@@async@@@@@@@@@@@@@@@@async@@@@@@@@async@@async@@async@... | @@@@(max-width: 640px) 100vw, 640px@@(max-width: 640px) 100vw, 640px@@(max-width: 640px) 100vw, 640px@@@@(max-width:... | None | 192.0.66.108 | 2024-02-19 08:46:47 | NaN | nginx | Mon, 19 Feb 2024 08:46:47 GMT | text/html; charset=UTF-8 | a9130478a60e5f9135f765b23f26593b | Go Flight! 
| <https://www.nasa.gov/wp-json/>; rel="https://api.w.org/" | hhn1 85 188 443 | max-age=300, must-revalidate | 0.0 | miss | Accept-Encoding | bytes | max-age=31536000 | text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8 | en | advertools/0.14.0 | gzip, deflate, br | \n\t\t\t\t\tSearch Results for: International Space Station\t\t\t\t | International Space Station - NASA@@20 Years of Observing Earth from the International Space Station - NASA@@SpaceX ... | https://www.nasa.gov/ | None | None | None | None | NaN | NaN | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None | None |
Code
md(f'#### Rows: {crawldf.shape[0]:,}. Columns: {crawldf.shape[1]}')

Rows: 9,942. Columns: 170
Get a mapping of all links on the website with `adv.crawlytics.links`
# Build one row per (page, link) pair found in the crawl.
# A link is flagged internal only when its *host* is nasa.gov or one of its
# subdomains. An unanchored r'nasa\.gov' would also match third-party share
# URLs that merely carry a nasa.gov address in their query string
# (e.g. https://www.facebook.com/sharer.php?u=https%3A%2F%2Fwww.nasa.gov...).
# NOTE(review): assumes the pattern is applied as a regex search over the
# full link URL - confirm against the advertools documentation.
link_df = adv.crawlytics.links(
    crawldf,
    internal_url_regex=r'^https?://(?:[^/?#]+\.)?nasa\.gov(?:[:/?#]|$)',
)
link_df

| url | link | text | nofollow | internal | |
|---|---|---|---|---|---|
| 0 | https://www.nasa.gov/ | https://www.nasa.gov/ | \n\t\t\t\t\n\t\t\t | False | True |
| 0 | https://www.nasa.gov/ | https://www.nasa.gov/ | \n\t\t\t\t\n\t\t\t | False | True |
| 0 | https://www.nasa.gov/ | https://www.nasa.gov/news/ | \n\t\t\t\t\t\t\t\tNews & Events\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t | False | True |
| 0 | https://www.nasa.gov/ | https://www.nasa.gov/news/all-news/ | \n\t\t\t\t\t\t\t\t\t\tAll NASA News\n\t\t\t\t\t\t\t\t\t | False | True |
| 0 | https://www.nasa.gov/ | https://plus.nasa.gov/series/ | \n\t\t\t\t\t\t\t\t\t\tVideo Series on NASA+\n\t\t\t\t\t\t\t\t\t | False | True |
| ... | ... | ... | ... | ... | ... |
| 9941 | https://www.nasa.gov/image-detail/bracing-fuel-efficient-flight-2/ | http://oig.nasa.gov/ | Office of the IG | False | True |
| 9941 | https://www.nasa.gov/image-detail/bracing-fuel-efficient-flight-2/ | https://www.nasa.gov/budgets-plans-and-reports/ | Budget & Annual Reports | False | True |
| 9941 | https://www.nasa.gov/image-detail/bracing-fuel-efficient-flight-2/ | https://www.nasa.gov/organizations/budget-annual-reports/agency-financial-reports/ | Agency Financial Reports | False | True |
| 9941 | https://www.nasa.gov/image-detail/bracing-fuel-efficient-flight-2/ | https://www.nasa.gov/contact-nasa/ | Contact NASA | False | True |
| 9941 | https://www.nasa.gov/image-detail/bracing-fuel-efficient-flight-2/ | https://www.nasa.gov/accessibility/ | Accessibility | False | True |
1999297 rows × 5 columns
Incoming links per page
Find the most linked-to pages and exclude them from the table; the links that remain are the inner (contextual) links, i.e. those within article text, product descriptions, etc.
Code
# Count how many times each URL is linked to across all crawled pages,
# most-linked first. The axis and the counts are named explicitly so the
# result always has the columns ['link', 'count']; a bare `.reset_index()`
# only produces those names on pandas >= 2.0.
inlink_counts = (
    link_df['link']
    .value_counts()
    .rename_axis('link')
    .reset_index(name='count')
)
inlink_counts

| link | count | |
|---|---|---|
| 0 | https://plus.nasa.gov/ | 40460 |
| 1 | https://www.nasa.gov/ | 33633 |
| 2 | https://www.nasa.gov/about/ | 32388 |
| 3 | https://science.nasa.gov/earth/ | 32346 |
| 4 | https://www.nasa.gov/podcasts/ | 32278 |
| ... | ... | ... |
| 47854 | https://www.linkedin.com/shareArticle?mini=true&url=https%3A%2F%2Fwww.nasa.gov%2Fimage-detail%2Fp-38-integration-191%2F | 1 |
| 47855 | https://www.facebook.com/sharer.php?u=https%3A%2F%2Fwww.nasa.gov%2Fimage-detail%2Fp-38-integration-335%2F | 1 |
| 47856 | https://www.pinterest.com/pin-builder/?description=Aaron&media=https%3A%2F%2Fwww.nasa.gov%2Fwp-content%2Fuploads%2F2... | 1 |
| 47857 | https://twitter.com/intent/tweet?via=NASA&text=Aaron&url=https%3A%2F%2Fwww.nasa.gov%2Fimage-detail%2Fp-38-integratio... | 1 |
| 47858 | https://www.nasa.gov/media-tags/sfd/ | 1 |
47859 rows × 2 columns
Code
import plotly.graph_objects as go
# Scatter plot of inlink counts: one open circle per URL, positioned on the
# x-axis by its row number in `inlink_counts`, with a log-scaled y-axis.
url_positions = list(range(len(inlink_counts)))
hover_text = "<b>%{text}</b><br><br>Incoming links: %{y:,}"
marker_style = {'opacity': 0.6, 'size': 8, 'symbol': 'circle-open'}

fig = go.Figure()
fig.add_scattergl(
    x=url_positions,
    y=inlink_counts['count'],
    text=inlink_counts['link'],
    hovertemplate=hover_text,
    marker=marker_style,
    name='',  # empty trace name keeps the hover box free of a trace label
    mode='markers',
)

layout = fig.layout
layout.template = 'cosmo'
layout.height = 600
layout.title = 'Number of incoming links per URL'
layout.xaxis.title = 'URL'
layout.yaxis.title = 'Number of inlinks<br>log scale'
layout.yaxis.type = 'log'
fig

Cluster pages by inlinks using KMeans clustering
Code
from sklearn.cluster import KMeans
# Number of clusters to split the URLs into, based on their inlink counts.
k = 5

# A fixed seed and an explicit number of initialisations make the cluster
# assignment reproducible between runs. The numeric cluster ids themselves
# carry no meaning and may differ from ids shown in earlier outputs.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=0)
kmeans.fit(inlink_counts[['count']])

# Per-cluster summary of the inlink counts, ordered from the least to the
# most linked-to cluster. After `describe()`, the 'count' column holds the
# number of URLs in each cluster (not a number of inlinks).
cluster_df = (
    inlink_counts
    .groupby(kmeans.labels_)
    ['count']
    .describe()
    .sort_values('mean')
    [['min', 'mean', 'max', 'count']]
    .rename(columns={'mean': 'Avg number of inlinks'})
    .rename_axis('cluster')
)
Code
# Bar chart of the cluster summary: x is the average number of inlinks in a
# cluster, y (log scale) is the number of pages that fall in that cluster.
fig = px.bar(
    cluster_df,
    x='Avg number of inlinks',
    y='count',
    log_y=True,
    height=500,
    template='cosmo',
    hover_data=['min', 'max'],
    title=f'<b>Number of Inlinks distribution({k} clusters).</b>',
    # Only 'count' needs a friendlier axis label; the x column name is
    # already descriptive.
    labels={'count': "Number of pages in cluster<br>log scale"},
)
# customdata[0] / customdata[1] are the 'min' / 'max' columns passed through
# `hover_data` above.
fig.data[0].hovertemplate = (
    '<b>Average number of inlinks: %{x:,.0f}</b><br><br>'
    'Number of pages in cluster: %{y:,.0f}<br><br>'
    'min: %{customdata[0]:,.0f}<br>'
    'max: %{customdata[1]:,.0f}<extra></extra>'
)
fig.layout.font.size = 14
# Put the x ticks exactly at the cluster averages.
fig.layout.xaxis.tickvals = cluster_df['Avg number of inlinks'].round()
fig.show()
Code
# Show the cluster summary as a styled table: thousands separators, no
# decimals, and a colour gradient to make the differences stand out.
display(
    cluster_df.style
    .format('{:,.0f}')
    .background_gradient(cmap='cividis')
    .set_caption(f'<h3>Inlink cluster details ({k} clusters)</h3>')
)
| min | Avg number of inlinks | max | count | |
|---|---|---|---|---|
| cluster | ||||
| 0 | 1 | 2 | 2,811 | 47,689 |
| 1 | 8,062 | 8,104 | 9,271 | 131 |
| 3 | 16,128 | 16,155 | 16,216 | 20 |
| 2 | 24,196 | 24,326 | 24,611 | 14 |
| 4 | 32,278 | 34,221 | 40,460 | 5 |
# URLs with more than 200 incoming links are treated as the "top" links to
# exclude when looking for contextual links.
top_links = inlink_counts.loc[inlink_counts['count'] > 200, 'link']
contextual_links = link_df[~link_df['link'].isin(top_links)]

Example 1
https://www.nasa.gov/image-detail/bracing-fuel-efficient-flight-2/
Code
contextual_links.loc[9941, :]

| url | link | text | nofollow | internal | |
|---|---|---|---|---|---|
| 9941 | https://www.nasa.gov/image-detail/bracing-fuel-efficient-flight-2/ | https://www.nasa.gov/image-detail/bracing-fuel-efficient-flight-2/ | \n\t\t\t\t\tNews & Events\n\t\t\t\t\t\n\t\t\t\t | False | True |
| 9941 | https://www.nasa.gov/image-detail/bracing-fuel-efficient-flight-2/ | https://www.nasa.gov/image-detail/bracing-fuel-efficient-flight-2/ | \n\t\t\t\t\tMultimedia\n\t\t\t\t\t\n\t\t\t\t | False | True |
| 9941 | https://www.nasa.gov/image-detail/bracing-fuel-efficient-flight-2/ | https://www.nasa.gov/image-article/bracing-fuel-efficient-flight/ | Read More | False | True |
| 9941 | https://www.nasa.gov/image-detail/bracing-fuel-efficient-flight-2/ | https://www.facebook.com/sharer.php?u=https%3A%2F%2Fwww.nasa.gov%2Fimage-detail%2Fbracing-fuel-efficient-flight-2%2F | \n\t\t\tFacebook\n\t\t\t | True | True |
| 9941 | https://www.nasa.gov/image-detail/bracing-fuel-efficient-flight-2/ | https://www.pinterest.com/pin-builder/?description=Bracing%20for%20Fuel-Efficient%20Flight&media=https%3A%2F%2Fwww.n... | \n\t\t\tPinterest\n\t\t\t | True | True |
| 9941 | https://www.nasa.gov/image-detail/bracing-fuel-efficient-flight-2/ | https://twitter.com/intent/tweet?via=NASA&text=Bracing%20for%20Fuel-Efficient%20Flight&url=https%3A%2F%2Fwww.nasa.go... | \n\t\t\tX\n\t\t\t | True | True |
| 9941 | https://www.nasa.gov/image-detail/bracing-fuel-efficient-flight-2/ | https://www.linkedin.com/shareArticle?mini=true&url=https%3A%2F%2Fwww.nasa.gov%2Fimage-detail%2Fbracing-fuel-efficie... | \n\t\t\tLinkedIn\n\t\t\t | True | True |
| 9941 | https://www.nasa.gov/image-detail/bracing-fuel-efficient-flight-2/ | https://www.nasa.gov/media-tags/aatt/ | Advanced Air Transport Technology | False | True |
| 9941 | https://www.nasa.gov/image-detail/bracing-fuel-efficient-flight-2/ | https://www.nasa.gov/media-tags/aavp/ | Advanced Air Vehicles Program | False | True |
| 9941 | https://www.nasa.gov/image-detail/bracing-fuel-efficient-flight-2/ | https://www.nasa.gov/media-tags/aeronautics/ | Aeronautics | False | True |
| 9941 | https://www.nasa.gov/image-detail/bracing-fuel-efficient-flight-2/ | https://www.nasa.gov/media-tags/iasp/ | Integrated Aviation Systems Program | False | True |
| 9941 | https://www.nasa.gov/image-detail/bracing-fuel-efficient-flight-2/ | https://www.nasa.gov/media-tags/sfd/ | Sustainable Flight Demonstrator | False | True |
Example 2
https://www.nasa.gov/missions/icon/nasas-icon-explores-the-boundary-between-earth-and-space/
Code
contextual_links.loc[9938, :]

| url | link | text | nofollow | internal | |
|---|---|---|---|---|---|
| 9938 | https://www.nasa.gov/missions/icon/nasas-icon-explores-the-boundary-between-earth-and-space/ | https://www.nasa.gov/missions/icon/nasas-icon-explores-the-boundary-between-earth-and-space/ | \n\t\t\t\t\tNews & Events\n\t\t\t\t\t\n\t\t\t\t | False | True |
| 9938 | https://www.nasa.gov/missions/icon/nasas-icon-explores-the-boundary-between-earth-and-space/ | https://www.nasa.gov/missions/icon/nasas-icon-explores-the-boundary-between-earth-and-space/ | \n\t\t\t\t\tMultimedia\n\t\t\t\t\t\n\t\t\t\t | False | True |
| 9938 | https://www.nasa.gov/missions/icon/nasas-icon-explores-the-boundary-between-earth-and-space/ | https://x.com/intent/tweet?via=NASA&text=NASA%E2%80%99s%20ICON%20Explores%20the%20Boundary%20Between%20Earth%20and%2... | \n\t\t\t\t\n\t\t\t | False | True |
| 9938 | https://www.nasa.gov/missions/icon/nasas-icon-explores-the-boundary-between-earth-and-space/ | https://www.facebook.com/sharer.php?u=https%3A%2F%2Fwww.nasa.gov%2Fmissions%2Ficon%2Fnasas-icon-explores-the-boundar... | \n\t\t\t\t\n\t\t\t | False | True |
| 9938 | https://www.nasa.gov/missions/icon/nasas-icon-explores-the-boundary-between-earth-and-space/ | https://www.linkedin.com/shareArticle?mini=true&url=https%3A%2F%2Fwww.nasa.gov%2Fmissions%2Ficon%2Fnasas-icon-explor... | \n\t\t\t\t\n\t\t\t | False | True |
| 9938 | https://www.nasa.gov/missions/icon/nasas-icon-explores-the-boundary-between-earth-and-space/ | https://svs.gsfc.nasa.gov/12699 | Download this video in HD formats from NASA’s Goddard Space Flight Center’s Scientific Visualization Studio | False | True |
| 9938 | https://www.nasa.gov/missions/icon/nasas-icon-explores-the-boundary-between-earth-and-space/ | https://www.nasa.gov/nasalive | NASA TV | False | True |
| 9938 | https://www.nasa.gov/missions/icon/nasas-icon-explores-the-boundary-between-earth-and-space/ | https://www.nasa.gov/content/icon-spacecraft-and-instruments | MIGHTI instrument | False | True |
| 9938 | https://www.nasa.gov/missions/icon/nasas-icon-explores-the-boundary-between-earth-and-space/ | https://www.nasa.gov/icon | NASA’s ICON website | False | True |
| 9938 | https://www.nasa.gov/missions/icon/nasas-icon-explores-the-boundary-between-earth-and-space/ | http://icon.ssl.berkeley.edu/ | UC Berkeley’s ICON website | False | False |
| 9938 | https://www.nasa.gov/missions/icon/nasas-icon-explores-the-boundary-between-earth-and-space/ | https://www.nasa.gov/goddard | NASA’s Goddard Space Flight Center | False | True |
Example 3
https://www.nasa.gov/history/50-years-ago-nasa-names-apollo-16-crew/
Code
contextual_links.loc[1465, :]

| url | link | text | nofollow | internal | |
|---|---|---|---|---|---|
| 1465 | https://www.nasa.gov/history/50-years-ago-nasa-names-apollo-16-crew/ | https://www.nasa.gov/history/50-years-ago-nasa-names-apollo-16-crew/ | \n\t\t\t\t\tNews & Events\n\t\t\t\t\t\n\t\t\t\t | False | True |
| 1465 | https://www.nasa.gov/history/50-years-ago-nasa-names-apollo-16-crew/ | https://www.nasa.gov/history/50-years-ago-nasa-names-apollo-16-crew/ | \n\t\t\t\t\tMultimedia\n\t\t\t\t\t\n\t\t\t\t | False | True |
| 1465 | https://www.nasa.gov/history/50-years-ago-nasa-names-apollo-16-crew/ | https://x.com/intent/tweet?via=NASA&text=50%20Years%20Ago%3A%20NASA%20Names%20Apollo%2016%20Crew&url=https%3A%2F%2Fw... | \n\t\t\t\t\n\t\t\t | False | True |
| 1465 | https://www.nasa.gov/history/50-years-ago-nasa-names-apollo-16-crew/ | https://www.facebook.com/sharer.php?u=https%3A%2F%2Fwww.nasa.gov%2Fhistory%2F50-years-ago-nasa-names-apollo-16-crew%2F | \n\t\t\t\t\n\t\t\t | False | True |
| 1465 | https://www.nasa.gov/history/50-years-ago-nasa-names-apollo-16-crew/ | https://www.linkedin.com/shareArticle?mini=true&url=https%3A%2F%2Fwww.nasa.gov%2Fhistory%2F50-years-ago-nasa-names-a... | \n\t\t\t\t\n\t\t\t | False | True |
| 1465 | https://www.nasa.gov/history/50-years-ago-nasa-names-apollo-16-crew/ | https://historycollection.jsc.nasa.gov/JSCHistoryPortal/history/oral_histories/MattinglyTK/mattinglytk.htm | Thomas K. Mattingly | False | True |
| 1465 | https://www.nasa.gov/history/50-years-ago-nasa-names-apollo-16-crew/ | https://www.nasa.gov/historycollection.jsc.nasa.gov/JSCHistoryPortal/history/oral_histories/DukeCM/dukecm.htm | Charles M. Duke | False | True |
| 1465 | https://www.nasa.gov/history/50-years-ago-nasa-names-apollo-16-crew/ | https://www.nasa.gov/history/50-years-ago-apollo-10-clears-the-way-for-the-first-moon-landing/ | Apollo 10 | False | True |
| 1465 | https://www.nasa.gov/history/50-years-ago-nasa-names-apollo-16-crew/ | https://www.nasa.gov/history/50-years-ago-apollo-13-and-german-measles/ | Apollo 13 | False | True |
| 1465 | https://www.nasa.gov/history/50-years-ago-nasa-names-apollo-16-crew/ | https://www.nasa.gov/history/50-years-ago-one-small-step-one-giant-leap/ | Apollo 11 | False | True |
| 1465 | https://www.nasa.gov/history/50-years-ago-nasa-names-apollo-16-crew/ | https://historycollection.jsc.nasa.gov/JSCHistoryPortal/history/oral_histories/HaiseFW/haisefw.htm | Fred W. Haise | False | True |
| 1465 | https://www.nasa.gov/history/50-years-ago-nasa-names-apollo-16-crew/ | https://historycollection.jsc.nasa.gov/JSCHistoryPortal/history/oral_histories/MitchellED/mitchelled.htm | Edgar D. Mitchell | False | True |
| 1465 | https://www.nasa.gov/history/50-years-ago-nasa-names-apollo-16-crew/ | https://www.nasa.gov/history/50-years-ago-houston-weve-had-a-problem/ | Apollo 13 | False | True |
| 1465 | https://www.nasa.gov/history/50-years-ago-nasa-names-apollo-16-crew/ | https://www.nasa.gov/history/50-years-ago-apollo-14-splashdown-and-recovery/ | Apollo 14 | False | True |
| 1465 | https://www.nasa.gov/history/50-years-ago-nasa-names-apollo-16-crew/ | https://historycollection.jsc.nasa.gov/JSCHistoryPortal/history/oral_histories/PogueWR/poguewr.htm | William R. Pogue | False | True |
| 1465 | https://www.nasa.gov/history/50-years-ago-nasa-names-apollo-16-crew/ | https://historycollection.jsc.nasa.gov/JSCHistoryPortal/history/oral_histories/CarrGP/carrgp.htm | Gerald P. Carr | False | True |
| 1465 | https://www.nasa.gov/history/50-years-ago-nasa-names-apollo-16-crew/ | https://www.nasa.gov/history/50-years-ago-preparations-for-apollo-14-15-and-16/ | cancelled | False | True |
| 1465 | https://www.nasa.gov/history/50-years-ago-nasa-names-apollo-16-crew/ | https://www.nasa.gov/history/skylab-americas-first-space-station/ | Skylab | False | True |
| 1465 | https://www.nasa.gov/history/50-years-ago-nasa-names-apollo-16-crew/ | https://www.nasa.gov/history/40-years-ago-space-shuttle-enterprise-rolls-to-the-pad/ | Enterprise | False | True |